Importing the data

# DATA
chrD=read.table(file ="filtered_recombination_data_withBin")

names(chrD)=c("R_ID", "Chrm","PhysD","Pos3.0+1","Index",
              "BTAU","BTAU+1","cM","ID1","Database",
              "ID2","Type","Informative_Meiosis",
              "Kosambi_distance","BinID")

head(chrD)
##   R_ID Chrm   PhysD Pos3.0+1 Index    BTAU  BTAU+1     cM        ID1
## 1    1    1  822710   822711     1  620748  620749  3.040 rs29017090
## 2    2    1  822930   822931     2  620968  620969  2.845 rs29017089
## 3    3    1 2049815  2049816     3 1857578 1857579  2.531 rs29015832
## 4    4    1 2049855  2049856     4 1857618 1857619  2.843 rs29015833
## 5    5    1 2305629  2305630     5 2114048 2114049 14.092 rs29026024
## 6    6    1 2994006  2994007     6 2792672 2792673  0.985 rs29027420
##   Database        ID2 Type Informative_Meiosis Kosambi_distance BinID
## 1     NCBI rs29017090  SNP                 346            5.703     1
## 2     NCBI rs29017089  SNP                 359            0.195     1
## 3     NCBI rs29015832  SNP                 173            0.312     2
## 4     NCBI rs29015833  SNP                 316            0.002     2
## 5     NCBI rs29026024  SNP                  37            0.999     3
## 6     NCBI rs29027420  SNP                 233            1.082     3
##############

Looking at the trend on chromsome 1

We are just examining closer the chromosome 1 as an example but this trend is quite repeatable among all chromosomes

The cubic fit of genetic distances along the chromosome

We use a cubic fit using least squares for fitting (standard polynomial regression). Note that we use the following convention when regression genetic distance (in cM) against physical distance D (in Mb) \[ cM = a + b D + C D^2 + d D^3 \].

CubReg=lm(cM~PhysD + I(PhysD^2) + I(PhysD^3), 
            data=chrD_selectionChrm)
summary(CubReg)
## 
## Call:
## lm(formula = cM ~ PhysD + I(PhysD^2) + I(PhysD^3), data = chrD_selectionChrm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6379 -1.3776 -0.2759  2.0150  7.0026 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.829e+00  4.744e-01   8.072 1.06e-14 ***
## PhysD        1.432e+00  2.723e-02  52.609  < 2e-16 ***
## I(PhysD^2)  -8.059e-03  4.101e-04 -19.650  < 2e-16 ***
## I(PhysD^3)   3.318e-05  1.727e-06  19.211  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.579 on 358 degrees of freedom
## Multiple R-squared:  0.9962, Adjusted R-squared:  0.9961 
## F-statistic: 3.103e+04 on 3 and 358 DF,  p-value: < 2.2e-16
  # Terms of the of the cubic equation:
  CubReg$coefficients
##   (Intercept)         PhysD    I(PhysD^2)    I(PhysD^3) 
##  3.829346e+00  1.432352e+00 -8.059209e-03  3.317595e-05
  a = CubReg$coefficients[1]
  b = CubReg$coefficients[2]
  c = CubReg$coefficients[3]
  d = CubReg$coefficients[4]

Checking visually the polynomial fit

plot(chrD_selectionChrm$PhysD,chrD_selectionChrm$cM, pch=20, cex=0.25, col="grey30",
       xlab="Cumulated Physical distance (Mb)", ylab="CUMULATED cM", main=paste("Obs versus fitted trend on chr ", chrm))
lines(chrD_selectionChrm$PhysD,CubReg$fitted.values, type = "l", col="cornflowerblue", lwd=2)

Fitting the whole set of chromosomes

listChr=unique(chrD$Chrm)
Toscale=10^6 # I want to have Mb as physical distance units
for (chrm in 1:29){
  chrD_selectionChrm = chrD[chrD[,2]==chrm,]
  chrD_selectionChrm$PhysD=chrD_selectionChrm$PhysD/Toscale
  CubReg=lm(cM~PhysD + I(PhysD^2) + I(PhysD^3), 
            data=chrD_selectionChrm)
  plot(chrD_selectionChrm$PhysD,chrD_selectionChrm$cM, pch=20, cex=0.25,
       col="grey30", 
       xlab="Cumulated Physical distance (Mb)", ylab="CUMULATED cM",
       main=paste("Obs versus fitted trend on chr ", chrm))
  lines(chrD_selectionChrm$PhysD,CubReg$fitted.values, type = "l",
        col="cornflowerblue", lwd=2)
}

Overview of the variation in local recombination rates throughout the genome

## Examine and parse the estimated local recombination rates from Belen's script.

library(tidyverse) 
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
df=read_csv("local_recomb_rate_per_bin_NEW.txt",col_names = c("chr","bin","begin","end","mid", "cMperMb"),progress = T)
## Parsed with column specification:
## cols(
##   chr = col_integer(),
##   bin = col_integer(),
##   begin = col_double(),
##   end = col_double(),
##   mid = col_double(),
##   cMperMb = col_double()
## )
names(df)
## [1] "chr"     "bin"     "begin"   "end"     "mid"     "cMperMb"
glimpse(df)
## Observations: 1,936
## Variables: 6
## $ chr     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ bin     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ begin   <dbl> 822710, 2049815, 2305629, 3553857, 4727118, 5647166, 6...
## $ end     <dbl> 822930, 2049855, 3249057, 3601437, 4784221, 5741816, 7...
## $ mid     <dbl> 822820, 2049835, 2777343, 3577647, 4755670, 5694491, 6...
## $ cMperMb <dbl> 1.4191572, 1.3997305, 1.3883537, 1.3759603, 1.3579495,...
head(df)
## # A tibble: 6 x 6
##     chr   bin   begin     end      mid cMperMb
##   <int> <int>   <dbl>   <dbl>    <dbl>   <dbl>
## 1     1     1  822710  822930  822820     1.42
## 2     1     2 2049815 2049855 2049835     1.40
## 3     1     3 2305629 3249057 2777343     1.39
## 4     1     4 3553857 3601437 3577647     1.38
## 5     1     5 4727118 4784221 4755670.    1.36
## 6     1     6 5647166 5741816 5694491     1.34
dim(df)
## [1] 1936    6
summary(df)
##       chr             bin             begin               end           
##  Min.   : 1.00   Min.   :  1.00   Min.   :  169363   Min.   :     -Inf  
##  1st Qu.: 5.00   1st Qu.: 17.00   1st Qu.:25052622   1st Qu.: 12032212  
##  Median :11.00   Median : 34.00   Median :50154374   Median : 37050024  
##  Mean   :12.28   Mean   : 37.74   Mean   :     Inf   Mean   :     -Inf  
##  3rd Qu.:19.00   3rd Qu.: 54.00   3rd Qu.:89438273   3rd Qu.: 64798742  
##  Max.   :29.00   Max.   :126.00   Max.   :     Inf   Max.   :157837373  
##                                                                         
##       mid               cMperMb      
##  Min.   :   402021   Min.   :0.5574  
##  1st Qu.: 21904767   1st Qu.:0.9301  
##  Median : 43339789   Median :1.1834  
##  Mean   : 48517362   Mean   :1.2855  
##  3rd Qu.: 69209223   3rd Qu.:1.4993  
##  Max.   :157707058   Max.   :4.6415  
##  NA's   :259         NA's   :259
df %>%  filter(!is.na(cMperMb)) -> df # excluding bins with no LRR

dim(df)
## [1] 1677    6
ggplot(data = df) + geom_histogram(aes(x=cMperMb),binwidth = 0.1)

# per chromosome
df %>% group_by(chr) %>% 
  summarise(mean=mean(cMperMb),
            median=median(cMperMb),
            Nbins=n()) ->cMperMBChrom

cMperMBChrom
## # A tibble: 29 x 4
##      chr  mean median Nbins
##    <int> <dbl>  <dbl> <int>
##  1     1 0.991  0.924   103
##  2     2 1.16   1.13     85
##  3     3 1.13   1.04     78
##  4     4 1.12   1.04     77
##  5     5 1.09   1.02     72
##  6     6 1.15   1.11     79
##  7     7 1.21   1.11     73
##  8     8 1.13   1.04     75
##  9     9 1.10   1.03     64
## 10    10 1.14   1.05     67
## # ... with 19 more rows
ggplot(data = df) + 
  geom_histogram(aes(x=cMperMb, fill=chr)) + 
  facet_wrap(~chr) +
  NULL
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.